set.seed(23) # fixed seed for functions that use random numbers

#' Render a data frame as an interactive DT table with one shared look.
#'
#' @param table_df Data frame to display.
#' @param round_columns Columns handed to `DT::formatRound()`; empty by default.
#' @param round_digits Number of digits handed to `DT::formatRound()`.
#' @return The `DT::datatable()` widget after `DT::formatRound()` was applied.
prettyTable <- function(table_df, round_columns = numeric(), round_digits = 2) {
  DT::datatable(
    table_df,
    style = "bootstrap",
    filter = "top",
    rownames = FALSE,
    extensions = "Buttons",
    options = list(
      dom = "Bfrtip",
      buttons = c("copy", "csv", "excel", "pdf", "print")
    )
  ) %>%
    # Qualified like datatable() above so the call does not rely on DT
    # being attached.
    DT::formatRound(round_columns, round_digits)
} # same look of tables

# Path the downloaded archive is written to. This assignment used to sit on
# the same line as the comment above, which turned it into comment text and
# left `tempFilePath` undefined.
tempFilePath <- paste0(working_dir, "/../data/temp.7z")
# Path of the extracted summary table; its presence decides whether the
# archive has to be downloaded.
dataFilePath <- paste0(working_dir, "/../data/all_summary.csv")
if (!file.exists(dataFilePath)) {
  message("Downloading data from: ", params$source_url)
  # download.file() does not create missing directories.
  dir.create(dirname(tempFilePath), recursive = TRUE, showWarnings = FALSE)
  # mode = "wb": the archive is binary; the default text mode corrupts it on
  # Windows because ".7z" is not one of the extensions R switches to binary
  # mode for automatically.
  download.file(params$source_url, tempFilePath, mode = "wb")
  # NOTE(review): the archive is extracted to "./data" (relative to the
  # current directory) while dataFilePath is built from working_dir/../data --
  # confirm both resolve to the same directory.
  archive_extract(tempFilePath, "./data")
  if (!file.exists(dataFilePath)) {
    stop("Data file not found")
  }
} else {
  message("Data was downloaded previously. Be careful!")
}
## Data was downloaded previously. Be careful!
# Number of leading rows read for the preview below.
sampleRowsNo <- 1000

# Preview of the data file: the first `sampleRowsNo` rows of the
# semicolon-separated table, first line used as header.
sample <- read.table(
  file = dataFilePath,
  sep = ";",
  nrows = sampleRowsNo,
  header = TRUE
)

# How many of the most frequent classes are kept (see head(noTopClasses)).
noTopClasses <- 50
# Target class for classification.
attrib.target_class <- "res_name"

# Every column name of the data set.
attrib.all <- colnames(data)

# Column groups selected by an (unanchored) name pattern.
attrib.local <- attrib.all[grep("local_", attrib.all)]       # "local" attributes
attrib.dict <- attrib.all[grep("dict_atom_", attrib.all)]    # dict_atom attributes
attrib.part <- attrib.all[grep("part_", attrib.all)]         # parts
attrib.skeleton <- attrib.all[grep("skeleton_", attrib.all)] # skeleton

# Resolution column.
attrib.res <- "resolution"

# Parameter columns.
attrib.params <- c(
  "fo_col",
  "fc_col",
  "weight_col",
  "grid_space",
  "solvent_radius",
  "solvent_opening_radius"
)

# Unknown columns.
attrib.unknown <- c(
  "blob_coverage",
  "blob_volume_coverage_second",
  "resolution_max_limit",
  "FoFc_square_std",
  "res_coverage",
  "res_volume_coverage",
  "FoFc_mean",
  "FoFc_min",
  "blob_volume_coverage",
  "res_volume_coverage_second",
  "FoFc_std",
  "FoFc_max",
  "resolution"
)

# Attributes that must not be used for classification: the names listed
# here plus the unknown and parameter columns.
attrib.illegal <- c(
  "title",
  "pdb_code",
  "res_name",
  "res_id",
  "chain_id",
  "local_",
  "weight_col", # is NA!
  "skeleton_data",
  attrib.unknown,
  attrib.params
)

# Legal attributes: everything that is neither illegal, "local" nor dict_atom.
attrib.legal <- setdiff(
  attrib.all,
  c(attrib.illegal, attrib.local, attrib.dict)
)

# Legal attributes whose name contains "_shape_".
attrib.part.shape <- attrib.legal[grep("_shape_", attrib.legal)]
# Legal attributes whose name contains "_density_".
attrib.part.density <- attrib.legal[grepl("_density_", attrib.legal)]

# res_name values that are removed from the data set.
# NOTE(review): "H20" is written with a zero; if the water code "H2O" was
# intended it is not excluded -- confirm against the data.
excluded_names <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB", "ALA", "ARG", "ASN", "ASP", "CYS",
  "GLN", "GLU", "GLY", "HIS", "ILE", "LEU", "LYS", "MET", "MSE", "PHE", "PRO",
  "SEC", "SER", "THR", "TRP", "TYR", "VAL", "DA", "DG", "DT", "DC", "DU", "A",
  "G", "T", "C", "U", "HOH", "H20", "WAT"
)

# Rows whose res_name is present and not on the exclusion list.
data.raw <- data %>%
  filter(!res_name %in% excluded_names) %>%
  filter(!is.na(res_name))

# Per-res_name mean of every selected attribute, computed only from rows
# without any missing value in the selected columns. The gap-filling loop
# later in the file looks replacement values up in this table.
# Plain `<-` replaces `<<-` (this is top-level code) and the bare function
# replaces the deprecated funs() wrapper; resulting column names are unchanged.
replacements <- data.raw %>%
  select(c(attrib.legal, attrib.target_class), -c("skeleton_data")) %>%
  na.omit() %>%
  group_by(res_name) %>%
  summarize_all(mean)
# Fill missing attribute values with the mean of the row's class taken from
# `replacements`, then set every value that is still missing to 0.
ids <- data.raw[[attrib.target_class]]

# For every row of data.raw: index of the matching class row in
# `replacements` (NA when the class has no row there). One lookup replaces
# the per-column left_join, which re-joined the same keys for every column
# and printed a "Joining, by" message each time.
replacement_row <- match(ids, replacements[[attrib.target_class]])

for (col in colnames(replacements)) {
  bad <- is.na(data.raw[[col]])
  if (!any(bad)) {
    next
  }
  # Rows whose class is absent from `replacements` receive NA here and are
  # zero-filled below.
  data.raw[bad, col] <- replacements[[col]][replacement_row[bad]]
}

data.withoutGaps <- data.raw %>% replace(is.na(.), 0)
## Number of rows: 996
## Number of attributes: 412
## Number of legal attributes: 412
## vars n mean
## blob_coverage* 1 996 4.985700e+02
## res_coverage* 2 996 4.992900e+02
## title* 3 996 4.224800e+02
## pdb_code* 4 996 7.311000e+01
## res_name* 5 996 8.688000e+01
## res_id 6 996 7.320700e+02
## chain_id* 7 996 2.270000e+00
## blob_volume_coverage 8 996 6.400000e-01
## blob_volume_coverage_second 9 996 2.000000e-02
## res_volume_coverage 10 996 5.100000e-01
## res_volume_coverage_second 11 996 8.000000e-02
## local_res_atom_count 12 996 1.489000e+01
## local_res_atom_non_h_count 13 996 1.468000e+01
## local_res_atom_non_h_occupancy_sum 14 996 1.387000e+01
## local_res_atom_non_h_electron_sum 15 996 1.096800e+02
## local_res_atom_non_h_electron_occupancy_sum 16 996 1.025500e+02
## local_res_atom_C_count 17 996 8.670000e+00
## local_res_atom_N_count 18 996 1.440000e+00
## local_res_atom_O_count 19 996 3.680000e+00
## local_res_atom_S_count 20 996 2.200000e-01
## dict_atom_non_h_count 21 996 1.465000e+01
## dict_atom_non_h_electron_sum 22 996 1.095400e+02
## dict_atom_C_count 23 996 8.600000e+00
## dict_atom_N_count 24 996 1.390000e+00
## dict_atom_O_count 25 996 3.810000e+00
## dict_atom_S_count 26 996 2.200000e-01
## skeleton_data* 27 996 5.008200e+02
## skeleton_cycle_4 28 996 2.090000e+00
## skeleton_diameter 29 996 2.433000e+01
## skeleton_cycle_6 30 996 8.000000e-02
## skeleton_cycle_7 31 996 5.000000e-02
## skeleton_closeness_006_008 32 996 2.140000e+00
## skeleton_closeness_002_004 33 996 2.000000e-02
## skeleton_cycle_3 34 996 6.700000e-01
## skeleton_avg_degree 35 996 1.570000e+00
## skeleton_closeness_004_006 36 996 7.600000e-01
## skeleton_closeness_010_012 37 996 3.220000e+00
## skeleton_closeness_012_014 38 996 3.390000e+00
## skeleton_edges 39 996 4.115000e+01
## skeleton_radius 40 996 1.247000e+01
## skeleton_cycle_8_plus 41 996 1.450000e+00
## skeleton_closeness_020_030 42 996 5.770000e+00
## skeleton_deg_5_plus 43 996 2.090000e+00
## skeleton_closeness_016_018 44 996 2.500000e+00
## skeleton_closeness_008_010 45 996 3.060000e+00
## skeleton_closeness_018_020 46 996 2.130000e+00
## skeleton_average_clustering 47 996 0.000000e+00
## skeleton_closeness_040_050 48 996 3.340000e+00
## skeleton_closeness_014_016 49 996 3.090000e+00
## skeleton_center 50 996 1.800000e+00
## skeleton_closeness_000_002 51 996 1.100000e-01
## skeleton_density 52 996 2.300000e-01
## skeleton_closeness_030_040 53 996 3.140000e+00
## skeleton_deg_4 54 996 2.100000e-01
## skeleton_deg_0 55 996 1.100000e-01
## skeleton_deg_1 56 996 3.140000e+00
## skeleton_deg_2 57 996 3.029000e+01
## skeleton_deg_3 58 996 1.700000e+00
## skeleton_graph_clique_number 59 996 1.900000e+00
## skeleton_nodes 60 996 3.754000e+01
## skeleton_cycles 61 996 4.610000e+00
## skeleton_cycle_5 62 996 2.500000e-01
## skeleton_closeness_050_plus 63 996 4.870000e+00
## skeleton_periphery 64 996 1.970000e+00
## local_volume 65 996 8.316600e+02
## local_electrons 66 996 1.890000e+01
## local_mean 67 996 2.000000e-02
## local_std 68 996 1.200000e-01
## local_min 69 996 0.000000e+00
## local_max 70 996 1.350000e+00
## local_max_over_std 71 996 1.014000e+01
## local_skewness 72 996 2.200000e-01
## local_cut_by_mainchain_volume 73 996 4.900000e-01
## local_near_cut_count_C 74 996 4.860000e+00
## local_near_cut_count_other 75 996 3.000000e-02
## local_near_cut_count_S 76 996 1.700000e-01
## sd median
## blob_coverage* 2.882600e+02 4.975000e+02
## res_coverage* 2.886100e+02 4.985000e+02
## title* 2.434500e+02 4.335000e+02
## pdb_code* 4.264000e+01 8.100000e+01
## res_name* 3.889000e+01 9.100000e+01
## res_id 8.617600e+02 5.010000e+02
## chain_id* 2.160000e+00 1.500000e+00
## blob_volume_coverage 2.500000e-01 7.000000e-01
## blob_volume_coverage_second 5.000000e-02 0.000000e+00
## res_volume_coverage 3.000000e-01 4.700000e-01
## res_volume_coverage_second 2.200000e-01 0.000000e+00
## local_res_atom_count 1.502000e+01 8.000000e+00
## local_res_atom_non_h_count 1.480000e+01 8.000000e+00
## local_res_atom_non_h_occupancy_sum 1.444000e+01 7.000000e+00
## local_res_atom_non_h_electron_sum 9.798000e+01 6.400000e+01
## local_res_atom_non_h_electron_occupancy_sum 9.649000e+01 4.800000e+01
## local_res_atom_C_count 1.097000e+01 4.000000e+00
## local_res_atom_N_count 2.160000e+00 0.000000e+00
## local_res_atom_O_count 3.570000e+00 4.000000e+00
## local_res_atom_S_count 5.200000e-01 0.000000e+00
## dict_atom_non_h_count 1.490000e+01 7.000000e+00
## dict_atom_non_h_electron_sum 9.855000e+01 5.600000e+01
## dict_atom_C_count 1.109000e+01 3.000000e+00
## dict_atom_N_count 2.160000e+00 0.000000e+00
## dict_atom_O_count 3.570000e+00 4.000000e+00
## dict_atom_S_count 5.100000e-01 0.000000e+00
## skeleton_data* 2.889300e+02 5.005000e+02
## skeleton_cycle_4 3.095000e+01 0.000000e+00
## skeleton_diameter 2.990000e+01 1.300000e+01
## skeleton_cycle_6 1.350000e+00 0.000000e+00
## skeleton_cycle_7 8.300000e-01 0.000000e+00
## skeleton_closeness_006_008 1.262000e+01 0.000000e+00
## skeleton_closeness_002_004 5.100000e-01 0.000000e+00
## skeleton_cycle_3 1.063000e+01 0.000000e+00
## skeleton_avg_degree 6.900000e-01 1.860000e+00
## skeleton_closeness_004_006 8.880000e+00 0.000000e+00
## skeleton_closeness_010_012 9.970000e+00 0.000000e+00
## skeleton_closeness_012_014 9.740000e+00 0.000000e+00
## skeleton_edges 1.150300e+02 1.300000e+01
## skeleton_radius 1.501000e+01 7.000000e+00
## skeleton_cycle_8_plus 2.200000e+01 0.000000e+00
## skeleton_closeness_020_030 1.219000e+01 0.000000e+00
## skeleton_deg_5_plus 3.323000e+01 0.000000e+00
## skeleton_closeness_016_018 6.370000e+00 0.000000e+00
## skeleton_closeness_008_010 1.259000e+01 0.000000e+00
## skeleton_closeness_018_020 5.560000e+00 0.000000e+00
## skeleton_average_clustering 1.000000e-02 0.000000e+00
## skeleton_closeness_040_050 2.793000e+01 0.000000e+00
## skeleton_closeness_014_016 8.740000e+00 0.000000e+00
## skeleton_center 3.980000e+00 1.000000e+00
## skeleton_closeness_000_002 3.200000e-01 0.000000e+00
## skeleton_density 3.100000e-01 9.000000e-02
## skeleton_closeness_030_040 1.487000e+01 0.000000e+00
## skeleton_deg_4 2.040000e+00 0.000000e+00
## skeleton_deg_0 3.200000e-01 0.000000e+00
## skeleton_deg_1 3.090000e+00 2.000000e+00
## skeleton_deg_2 4.517000e+01 1.200000e+01
## skeleton_deg_3 3.650000e+00 0.000000e+00
## skeleton_graph_clique_number 3.500000e-01 2.000000e+00
## skeleton_nodes 6.088000e+01 1.400000e+01
## skeleton_cycles 6.936000e+01 0.000000e+00
## skeleton_cycle_5 3.850000e+00 0.000000e+00
## skeleton_closeness_050_plus 5.540000e+00 2.000000e+00
## skeleton_periphery 5.800000e-01 2.000000e+00
## local_volume 1.211220e+03 3.520800e+02
## local_electrons 2.521000e+01 8.770000e+00
## local_mean 2.000000e-02 2.000000e-02
## local_std 1.000000e-01 1.000000e-01
## local_min 0.000000e+00 0.000000e+00
## local_max 1.900000e+00 8.400000e-01
## local_max_over_std 9.030000e+00 7.310000e+00
## local_skewness 2.000000e-01 1.700000e-01
## local_cut_by_mainchain_volume 1.160000e+00 0.000000e+00
## local_near_cut_count_C 5.940000e+00 3.000000e+00
## local_near_cut_count_other 2.200000e-01 0.000000e+00
## local_near_cut_count_S 6.600000e-01 0.000000e+00
## trimmed mad
## blob_coverage* 4.983300e+02 3.699100e+02
## res_coverage* 4.991100e+02 3.706500e+02
## title* 4.225200e+02 3.076400e+02
## pdb_code* 7.410000e+01 5.486000e+01
## res_name* 8.846000e+01 4.300000e+01
## res_id 6.011800e+02 4.092000e+02
## chain_id* 1.760000e+00 7.400000e-01
## blob_volume_coverage 6.500000e-01 2.500000e-01
## blob_volume_coverage_second 0.000000e+00 0.000000e+00
## res_volume_coverage 4.900000e-01 3.400000e-01
## res_volume_coverage_second 1.000000e-02 0.000000e+00
## local_res_atom_count 1.288000e+01 1.038000e+01
## local_res_atom_non_h_count 1.263000e+01 1.038000e+01
## local_res_atom_non_h_occupancy_sum 1.166000e+01 8.900000e+00
## local_res_atom_non_h_electron_sum 9.654000e+01 5.041000e+01
## local_res_atom_non_h_electron_occupancy_sum 8.832000e+01 4.300000e+01
## local_res_atom_C_count 6.590000e+00 5.930000e+00
## local_res_atom_N_count 1.030000e+00 0.000000e+00
## local_res_atom_O_count 3.080000e+00 2.970000e+00
## local_res_atom_S_count 1.200000e-01 0.000000e+00
## dict_atom_non_h_count 1.263000e+01 8.900000e+00
## dict_atom_non_h_electron_sum 9.683000e+01 5.337000e+01
## dict_atom_C_count 6.490000e+00 4.450000e+00
## dict_atom_N_count 9.600000e-01 0.000000e+00
## dict_atom_O_count 3.250000e+00 2.970000e+00
## dict_atom_S_count 1.100000e-01 0.000000e+00
## skeleton_data* 5.008100e+02 3.713900e+02
## skeleton_cycle_4 0.000000e+00 0.000000e+00
## skeleton_diameter 1.862000e+01 1.779000e+01
## skeleton_cycle_6 0.000000e+00 0.000000e+00
## skeleton_cycle_7 0.000000e+00 0.000000e+00
## skeleton_closeness_006_008 0.000000e+00 0.000000e+00
## skeleton_closeness_002_004 0.000000e+00 0.000000e+00
## skeleton_cycle_3 0.000000e+00 0.000000e+00
## skeleton_avg_degree 1.680000e+00 2.000000e-01
## skeleton_closeness_004_006 0.000000e+00 0.000000e+00
## skeleton_closeness_010_012 3.000000e-01 0.000000e+00
## skeleton_closeness_012_014 6.500000e-01 0.000000e+00
## skeleton_edges 2.369000e+01 1.779000e+01
## skeleton_radius 9.650000e+00 8.900000e+00
## skeleton_cycle_8_plus 0.000000e+00 0.000000e+00
## skeleton_closeness_020_030 2.560000e+00 0.000000e+00
## skeleton_deg_5_plus 0.000000e+00 0.000000e+00
## skeleton_closeness_016_018 7.400000e-01 0.000000e+00
## skeleton_closeness_008_010 0.000000e+00 0.000000e+00
## skeleton_closeness_018_020 6.500000e-01 0.000000e+00
## skeleton_average_clustering 0.000000e+00 0.000000e+00
## skeleton_closeness_040_050 7.800000e-01 0.000000e+00
## skeleton_closeness_014_016 7.400000e-01 0.000000e+00
## skeleton_center 1.480000e+00 0.000000e+00
## skeleton_closeness_000_002 2.000000e-02 0.000000e+00
## skeleton_density 1.600000e-01 1.100000e-01
## skeleton_closeness_030_040 1.180000e+00 0.000000e+00
## skeleton_deg_4 0.000000e+00 0.000000e+00
## skeleton_deg_0 2.000000e-02 0.000000e+00
## skeleton_deg_1 2.630000e+00 0.000000e+00
## skeleton_deg_2 2.023000e+01 1.779000e+01
## skeleton_deg_3 8.200000e-01 0.000000e+00
## skeleton_graph_clique_number 1.980000e+00 0.000000e+00
## skeleton_nodes 2.454000e+01 1.779000e+01
## skeleton_cycles 3.000000e-02 0.000000e+00
## skeleton_cycle_5 0.000000e+00 0.000000e+00
## skeleton_closeness_050_plus 4.200000e+00 2.970000e+00
## skeleton_periphery 1.980000e+00 0.000000e+00
## local_volume 5.478400e+02 2.686800e+02
## local_electrons 1.335000e+01 9.730000e+00
## local_mean 2.000000e-02 1.000000e-02
## local_std 1.100000e-01 5.000000e-02
## local_min 0.000000e+00 0.000000e+00
## local_max 1.020000e+00 5.200000e-01
## local_max_over_std 8.410000e+00 3.950000e+00
## local_skewness 1.900000e-01 8.000000e-02
## local_cut_by_mainchain_volume 1.700000e-01 0.000000e+00
## local_near_cut_count_C 3.720000e+00 4.450000e+00
## local_near_cut_count_other 0.000000e+00 0.000000e+00
## local_near_cut_count_S 0.000000e+00 0.000000e+00
## min max
## blob_coverage* 1.00 9.990000e+02
## res_coverage* 1.00 1.000000e+03
## title* 1.00 8.490000e+02
## pdb_code* 1.00 1.360000e+02
## res_name* 1.00 1.510000e+02
## res_id 1.00 9.002000e+03
## chain_id* 1.00 1.600000e+01
## blob_volume_coverage 0.03 1.000000e+00
## blob_volume_coverage_second 0.00 3.700000e-01
## res_volume_coverage 0.01 1.000000e+00
## res_volume_coverage_second 0.00 1.000000e+00
## local_res_atom_count 1.00 5.300000e+01
## local_res_atom_non_h_count 1.00 5.300000e+01
## local_res_atom_non_h_occupancy_sum 0.10 5.300000e+01
## local_res_atom_non_h_electron_sum 12.00 3.840000e+02
## local_res_atom_non_h_electron_occupancy_sum 4.80 3.840000e+02
## local_res_atom_C_count 0.00 3.800000e+01
## local_res_atom_N_count 0.00 1.000000e+01
## local_res_atom_O_count 0.00 1.700000e+01
## local_res_atom_S_count 0.00 4.000000e+00
## dict_atom_non_h_count 0.00 5.500000e+01
## dict_atom_non_h_electron_sum 0.00 3.840000e+02
## dict_atom_C_count 0.00 5.000000e+01
## dict_atom_N_count 0.00 1.000000e+01
## dict_atom_O_count 0.00 1.700000e+01
## dict_atom_S_count 0.00 4.000000e+00
## skeleton_data* 1.00 1.000000e+03
## skeleton_cycle_4 0.00 6.300000e+02
## skeleton_diameter 0.00 1.760000e+02
## skeleton_cycle_6 0.00 2.700000e+01
## skeleton_cycle_7 0.00 1.700000e+01
## skeleton_closeness_006_008 0.00 1.600000e+02
## skeleton_closeness_002_004 0.00 1.600000e+01
## skeleton_cycle_3 0.00 1.910000e+02
## skeleton_avg_degree 0.00 5.980000e+00
## skeleton_closeness_004_006 0.00 2.070000e+02
## skeleton_closeness_010_012 0.00 6.900000e+01
## skeleton_closeness_012_014 0.00 9.500000e+01
## skeleton_edges 0.00 1.996000e+03
## skeleton_radius 0.00 8.800000e+01
## skeleton_cycle_8_plus 0.00 4.240000e+02
## skeleton_closeness_020_030 0.00 8.600000e+01
## skeleton_deg_5_plus 0.00 6.330000e+02
## skeleton_closeness_016_018 0.00 5.600000e+01
## skeleton_closeness_008_010 0.00 1.090000e+02
## skeleton_closeness_018_020 0.00 6.000000e+01
## skeleton_average_clustering 0.00 1.200000e-01
## skeleton_closeness_040_050 0.00 5.200000e+02
## skeleton_closeness_014_016 0.00 9.200000e+01
## skeleton_center 1.00 8.400000e+01
## skeleton_closeness_000_002 0.00 1.000000e+00
## skeleton_density 0.00 1.000000e+00
## skeleton_closeness_030_040 0.00 4.300000e+02
## skeleton_deg_4 0.00 3.600000e+01
## skeleton_deg_0 0.00 1.000000e+00
## skeleton_deg_1 0.00 3.100000e+01
## skeleton_deg_2 0.00 3.200000e+02
## skeleton_deg_3 0.00 3.500000e+01
## skeleton_graph_clique_number 1.00 4.000000e+00
## skeleton_nodes 1.00 6.680000e+02
## skeleton_cycles 0.00 1.329000e+03
## skeleton_cycle_5 0.00 6.700000e+01
## skeleton_closeness_050_plus 0.00 2.100000e+01
## skeleton_periphery 1.00 1.200000e+01
## local_volume 97.34 9.673280e+03
## local_electrons 0.47 1.713300e+02
## local_mean 0.00 2.400000e-01
## local_std 0.02 9.500000e-01
## local_min 0.00 0.000000e+00
## local_max 0.19 2.716000e+01
## local_max_over_std 3.13 1.017200e+02
## local_skewness 0.04 2.360000e+00
## local_cut_by_mainchain_volume 0.00 1.013000e+01
## local_near_cut_count_C 0.00 4.100000e+01
## local_near_cut_count_other 0.00 3.000000e+00
## local_near_cut_count_S 0.00 6.000000e+00
## range skew kurtosis
## blob_coverage* 9.980000e+02 0.01 -1.20
## res_coverage* 9.990000e+02 0.00 -1.20
## title* 8.480000e+02 -0.02 -1.18
## pdb_code* 1.350000e+02 -0.22 -1.34
## res_name* 1.500000e+02 -0.21 -0.89
## res_id 9.001000e+03 4.91 34.69
## chain_id* 1.500000e+01 2.91 10.13
## blob_volume_coverage 9.700000e-01 -0.56 -0.83
## blob_volume_coverage_second 3.700000e-01 3.59 13.82
## res_volume_coverage 9.900000e-01 0.34 -1.07
## res_volume_coverage_second 1.000000e+00 2.97 7.82
## local_res_atom_count 5.200000e+01 1.03 -0.34
## local_res_atom_non_h_count 5.200000e+01 1.05 -0.26
## local_res_atom_non_h_occupancy_sum 5.290000e+01 1.14 0.03
## local_res_atom_non_h_electron_sum 3.720000e+02 1.08 -0.23
## local_res_atom_non_h_electron_occupancy_sum 3.792000e+02 1.17 0.06
## local_res_atom_C_count 3.800000e+01 1.34 0.58
## local_res_atom_N_count 1.000000e+01 1.41 1.08
## local_res_atom_O_count 1.700000e+01 1.51 2.56
## local_res_atom_S_count 4.000000e+00 3.48 18.49
## dict_atom_non_h_count 5.500000e+01 1.01 -0.31
## dict_atom_non_h_electron_sum 3.840000e+02 1.04 -0.24
## dict_atom_C_count 5.000000e+01 1.33 0.56
## dict_atom_N_count 1.000000e+01 1.48 1.27
## dict_atom_O_count 1.700000e+01 1.47 2.59
## dict_atom_S_count 4.000000e+00 3.59 19.41
## skeleton_data* 9.990000e+02 0.00 -1.21
## skeleton_cycle_4 6.300000e+02 16.52 281.54
## skeleton_diameter 1.760000e+02 1.76 3.14
## skeleton_cycle_6 2.700000e+01 16.50 279.54
## skeleton_cycle_7 1.700000e+01 17.76 328.34
## skeleton_closeness_006_008 1.600000e+02 7.58 65.71
## skeleton_closeness_002_004 1.600000e+01 31.11 973.72
## skeleton_cycle_3 1.910000e+02 15.87 252.26
## skeleton_avg_degree 5.980000e+00 -0.33 6.04
## skeleton_closeness_004_006 2.070000e+02 17.65 357.99
## skeleton_closeness_010_012 6.900000e+01 3.62 13.59
## skeleton_closeness_012_014 9.500000e+01 3.90 19.92
## skeleton_edges 1.996000e+03 11.74 168.03
## skeleton_radius 8.800000e+01 1.74 3.04
## skeleton_cycle_8_plus 4.240000e+02 16.18 265.96
## skeleton_closeness_020_030 8.600000e+01 2.29 5.09
## skeleton_deg_5_plus 6.330000e+02 16.10 262.37
## skeleton_closeness_016_018 5.600000e+01 3.18 12.83
## skeleton_closeness_008_010 1.090000e+02 5.10 28.09
## skeleton_closeness_018_020 6.000000e+01 4.08 27.02
## skeleton_average_clustering 1.200000e-01 15.60 243.82
## skeleton_closeness_040_050 5.200000e+02 16.46 279.26
## skeleton_closeness_014_016 9.200000e+01 4.39 27.66
## skeleton_center 8.300000e+01 15.24 264.47
## skeleton_closeness_000_002 1.000000e+00 2.42 3.85
## skeleton_density 1.000000e+00 1.58 1.17
## skeleton_closeness_030_040 4.300000e+02 24.01 679.08
## skeleton_deg_4 3.600000e+01 15.20 239.39
## skeleton_deg_0 1.000000e+00 2.42 3.85
## skeleton_deg_1 3.100000e+01 3.36 17.35
## skeleton_deg_2 3.200000e+02 2.36 6.56
## skeleton_deg_3 3.500000e+01 3.95 22.14
## skeleton_graph_clique_number 3.000000e+00 -0.89 7.30
## skeleton_nodes 6.670000e+02 4.03 26.10
## skeleton_cycles 1.329000e+03 16.12 263.60
## skeleton_cycle_5 6.700000e+01 15.90 253.79
## skeleton_closeness_050_plus 2.100000e+01 0.76 -0.89
## skeleton_periphery 1.100000e+01 6.07 94.55
## local_volume 9.575940e+03 3.25 13.18
## local_electrons 1.708600e+02 2.48 7.20
## local_mean 2.400000e-01 3.51 21.67
## local_std 9.300000e-01 3.71 20.66
## local_min 0.000000e+00 NaN NaN
## local_max 2.698000e+01 7.64 81.96
## local_max_over_std 9.860000e+01 4.22 27.93
## local_skewness 2.320000e+00 4.91 37.21
## local_cut_by_mainchain_volume 1.013000e+01 3.27 12.88
## local_near_cut_count_C 4.100000e+01 2.13 5.99
## local_near_cut_count_other 3.000000e+00 8.93 86.67
## local_near_cut_count_S 6.000000e+00 5.16 31.22
## se
## blob_coverage* 9.130000e+00
## res_coverage* 9.150000e+00
## title* 7.710000e+00
## pdb_code* 1.350000e+00
## res_name* 1.230000e+00
## res_id 2.731000e+01
## chain_id* 7.000000e-02
## blob_volume_coverage 1.000000e-02
## blob_volume_coverage_second 0.000000e+00
## res_volume_coverage 1.000000e-02
## res_volume_coverage_second 1.000000e-02
## local_res_atom_count 4.800000e-01
## local_res_atom_non_h_count 4.700000e-01
## local_res_atom_non_h_occupancy_sum 4.600000e-01
## local_res_atom_non_h_electron_sum 3.100000e+00
## local_res_atom_non_h_electron_occupancy_sum 3.060000e+00
## local_res_atom_C_count 3.500000e-01
## local_res_atom_N_count 7.000000e-02
## local_res_atom_O_count 1.100000e-01
## local_res_atom_S_count 2.000000e-02
## dict_atom_non_h_count 4.700000e-01
## dict_atom_non_h_electron_sum 3.120000e+00
## dict_atom_C_count 3.500000e-01
## dict_atom_N_count 7.000000e-02
## dict_atom_O_count 1.100000e-01
## dict_atom_S_count 2.000000e-02
## skeleton_data* 9.160000e+00
## skeleton_cycle_4 9.800000e-01
## skeleton_diameter 9.500000e-01
## skeleton_cycle_6 4.000000e-02
## skeleton_cycle_7 3.000000e-02
## skeleton_closeness_006_008 4.000000e-01
## skeleton_closeness_002_004 2.000000e-02
## skeleton_cycle_3 3.400000e-01
## skeleton_avg_degree 2.000000e-02
## skeleton_closeness_004_006 2.800000e-01
## skeleton_closeness_010_012 3.200000e-01
## skeleton_closeness_012_014 3.100000e-01
## skeleton_edges 3.640000e+00
## skeleton_radius 4.800000e-01
## skeleton_cycle_8_plus 7.000000e-01
## skeleton_closeness_020_030 3.900000e-01
## skeleton_deg_5_plus 1.050000e+00
## skeleton_closeness_016_018 2.000000e-01
## skeleton_closeness_008_010 4.000000e-01
## skeleton_closeness_018_020 1.800000e-01
## skeleton_average_clustering 0.000000e+00
## skeleton_closeness_040_050 8.900000e-01
## skeleton_closeness_014_016 2.800000e-01
## skeleton_center 1.300000e-01
## skeleton_closeness_000_002 1.000000e-02
## skeleton_density 1.000000e-02
## skeleton_closeness_030_040 4.700000e-01
## skeleton_deg_4 6.000000e-02
## skeleton_deg_0 1.000000e-02
## skeleton_deg_1 1.000000e-01
## skeleton_deg_2 1.430000e+00
## skeleton_deg_3 1.200000e-01
## skeleton_graph_clique_number 1.000000e-02
## skeleton_nodes 1.930000e+00
## skeleton_cycles 2.200000e+00
## skeleton_cycle_5 1.200000e-01
## skeleton_closeness_050_plus 1.800000e-01
## skeleton_periphery 2.000000e-02
## local_volume 3.838000e+01
## local_electrons 8.000000e-01
## local_mean 0.000000e+00
## local_std 0.000000e+00
## local_min 0.000000e+00
## local_max 6.000000e-02
## local_max_over_std 2.900000e-01
## local_skewness 1.000000e-02
## local_cut_by_mainchain_volume 4.000000e-02
## local_near_cut_count_C 1.900000e-01
## local_near_cut_count_other 1.000000e-02
## local_near_cut_count_S 2.000000e-02
## [ osiągnięto getOption("max.print") -- pominięto 336 wierszy]
# Minimum and maximum of every legal attribute, as one row with columns
# named <attribute>_min and <attribute>_max. A named list replaces the
# deprecated funs() wrapper and yields the same column names.
stats <- data.withoutGaps %>%
  select(attrib.legal) %>%
  summarise_all(list(min = min, max = max))

# Attributes that carry no information: minimum equals maximum.
# Computed in one vectorised step; the former loop grew the vector element by
# element and stored its intermediates in variables named max, min and diff.
.col_max <- unlist(stats[1, paste0(attrib.legal, "_max")], use.names = FALSE)
.col_min <- unlist(stats[1, paste0(attrib.legal, "_min")], use.names = FALSE)
.is_constant <- which(.col_max == .col_min)
attrib.empty <- c() # stays NULL when no attribute is constant, as before
if (length(.is_constant) > 0) {
  attrib.empty <- attrib.legal[.is_constant]
}

# The `noTopClasses` most frequent target classes with their row counts.
top50 <- data.withoutGaps %>%
  group_by(!!treat_string_as_col(attrib.target_class)) %>%
  summarize(numberOfExamples = n()) %>%
  arrange(desc(numberOfExamples)) %>%
  head(noTopClasses)
top50
## # A tibble: 50 x 2
## res_name numberOfExamples
## <fct> <int>
## 1 SO4 116
## 2 HEM 95
## 3 GOL 64
## 4 NAG 54
## 5 ZN 40
## 6 MLY 32
## 7 MG 30
## 8 CD 25
## 9 K 24
## 10 CA 21
## # ... with 40 more rows
# Keep only the class column of the frequency table and turn it into a plain
# vector of class names.
top50 <- top50 %>% select(c(attrib.target_class))
.topClasses <- unlist(top50[, attrib.target_class], use.names = FALSE)

# Rows that belong to one of the most frequent classes.
data.top50 <- data.withoutGaps %>%
  filter(!!treat_string_as_col(attrib.target_class) %in% .topClasses)

# Drop the attributes listed in attrib.empty.
data.withoutEmpty <- data.top50 %>% select(-attrib.empty)

# Only the legal attributes that are not in attrib.empty.
data.legal <- data.withoutEmpty %>%
  select(setdiff(attrib.legal, attrib.empty))
# Correlation matrix of the legal attributes.
correlation <- cor(data.legal)

# Long format: one row per attribute pair (Var1, Var2, value).
melted <- melt(correlation)

# Absolute correlation, strongest pairs first.
df.cor.mel <- data.frame(melted) %>%
  mutate(value = abs(value)) %>%
  arrange(desc(value))

# Heat map of absolute correlations, rendered as an interactive plotly widget.
p <- ggplot(df.cor.mel, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "brown") +
  xlab("parameters") +
  ylab("parameters") +
  theme(axis.text.x = element_text(angle = -90, hjust = 1))
ggplotly(p, height = 700, width = 700)